This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.
library(dplyr)
library(readr)
# Load data
path <- "/mnt/Data/deidentified_data/full_cohort/preprocessed_data_long.csv"
data <- read_csv(file=path, col_names=TRUE, col_select=-1, na=c("", "NA"))
## New names:
## Rows: 22096 Columns: 95
## ── Column specification
## ──────────────────────────────────────────────────────────── Delimiter: "," chr
## (4): sex, race, smoking_status, substance dbl (60): RandID, mort_age, visit_number,
## time_diff, age, height, weight, dlco,... lgl (28): has_copd, has_ct, has_hosp,
## pr_complete_ever, home_oxygen_ever, hosp_... date (3): diagnosis_date,
## pr_start_date, date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify
## the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Get only required columns
ids <- c("RandID", "visit_number",
"has_copd", "has_hosp",
"date", "time_diff",
"substance")
preds_cts = c(
"age", # Months
"height", # m
"weight", # kg
"bmi", # kg/m^2
"fev1", # L
"fvc", # L
"fivc", # L
"fev1_fvc", # %
"fev1_fev6", # %
"pef", # L/s
"mmef", # L/s
"dlco",
"spo2"
)
preds_bin = c("hosp_past_year", "sex") # Binary predictors
preds_categ = c("smoking_status", "mmrc") # Categorical
outcomes_bin <- c(
"hosp_1_year", "hosp_3_year", "hosp_5_year",
"mort_1_year", "mort_3_year", "mort_5_year"
)
outcomes_surv <- c(
"mort_next_time", # Time in months
"hosp_next_time" # Time in months
)
preds <- c( preds_bin, preds_cts, preds_categ)
outcomes <- c(outcomes_bin, outcomes_surv)
data <- data %>% select(all_of(c(ids, preds, outcomes)))
head(data)
## # A tibble: 6 × 32
## RandID visit_number has_copd has_hosp date time_diff substance
## <dbl> <dbl> <lgl> <lgl> <date> <dbl> <chr>
## 1 1 0 TRUE TRUE 2009-06-01 0 ventolin
## 2 1 1 TRUE TRUE 2013-08-01 50.0 ventolin
## 3 1 2 TRUE TRUE 2019-12-01 126. salbutamol
## 4 4 0 TRUE TRUE 2009-12-01 0 ventolin
## 5 6 0 TRUE TRUE 2019-08-01 0 salbutamol
## 6 7 0 TRUE FALSE 2019-03-01 0 salbutamol
## # ℹ 25 more variables: hosp_past_year <lgl>, sex <chr>, age <dbl>, height <dbl>,
## # weight <dbl>, bmi <dbl>, fev1 <dbl>, fvc <dbl>, fivc <dbl>, fev1_fvc <dbl>,
## # fev1_fev6 <dbl>, pef <dbl>, mmef <dbl>, dlco <dbl>, spo2 <dbl>,
## # smoking_status <chr>, mmrc <dbl>, hosp_1_year <lgl>, hosp_3_year <lgl>,
## # hosp_5_year <lgl>, mort_1_year <lgl>, mort_3_year <lgl>, mort_5_year <lgl>,
## # mort_next_time <dbl>, hosp_next_time <dbl>
library(naniar)
vis_miss(data %>% select(all_of(preds)))
gg_miss_var(data %>% select(preds))
sprintf("Pct missing cases (all preds): %f",
pct_miss_case(data %>% select(all_of(preds))))
## [1] "Pct missing cases (all preds): 89.930304"
sprintf("Pct missing cases (exclude dlco, fivc): %f",
pct_miss_case(data %>%
select(all_of(preds), -dlco, -fivc)))
## [1] "Pct missing cases (exclude dlco, fivc): 41.066256"
gg_miss_upset(data %>% select(all_of(preds)))
gg_miss_fct(x=data%>%select(all_of(preds)), fct=smoking_status)
gg_miss_fct(x=data%>%select(all_of(preds)), fct=mmrc)
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_tile()`).
mmrc and spo2 both having highly correlated missing values suggests that
they are often measured as a pair (though the presence of mMRC values
having non-na spo2 implies other things).
gg_miss_fct(x=data%>%select(all_of(preds)), fct=sex)
gg_miss_fct(x=data%>%select(all_of(preds)), fct=hosp_past_year)
hosp_past_year highlights some potentially interesting effects where hospitalisations in prior years increase the rate of having mmrc and dlco. It is possible these were measured in previous hospitalisations and they have been carried forward.
Also interested in change in missingness vs date
library(ggplot2)
data <- data %>%
mutate(date = as.numeric(difftime(date, as.Date("2006-01-01"), units="days")))
ggplot(data, aes(x=date, y=spo2)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=mmef)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=fivc)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=dlco)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=mmrc)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=smoking_status)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=fev1_fev6)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=height)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=weight)) + geom_miss_point(alpha=0.01)
ggplot(data, aes(x=date, y=bmi)) + geom_miss_point(alpha=0.1)
data %>% select(smoking_status)
## # A tibble: 22,096 × 1
## smoking_status
## <chr>
## 1 current
## 2 current
## 3 current
## 4 <NA>
## 5 ex
## 6 ex
## 7 ex
## 8 ex
## 9 ex
## 10 ex
## # ℹ 22,086 more rows
ggplot(data, aes(x=log(101-spo2))) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7499 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(data, aes(x=spo2^2)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7499 rows containing non-finite outside the scale range
## (`stat_bin()`).
# histogram of dates
ggplot(data, aes(x=date)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Distribution is vaguely uniform.